R Markdown

# Load the cancer registry data, convert the column names to snake_case, split
# `geography` into `county` and `state` factors, and derive `pct_case_count`
# (average annual case count divided by the 2015 population estimate).
cancer_df <- read_csv("./data/Cancer_Registry.csv") %>%
  janitor::clean_names() %>%
  # Split on the comma plus any whitespace that follows it, so that `state`
  # does not keep a leading space (assumes values look like "County, State").
  separate(geography, c("county", "state"), sep = ",\\s*") %>%
  mutate(
    county = as.factor(county),
    state = as.factor(state),
    pct_case_count = avg_ann_count / pop_est2015
  ) %>%
  # Put the response and the derived ratio first; the remaining columns keep
  # their existing order.
  select(target_death_rate, pct_case_count, everything())
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   avgDeathsPerYear = col_integer(),
##   medIncome = col_integer(),
##   popEst2015 = col_integer(),
##   binnedInc = col_character(),
##   Geography = col_character()
## )
## See spec(...) for full column specifications.

Percentage of annual diagnosed case count plot

# Scatter plot of the case-count ratio against the death rate, coloured by
# state, then converted to a plotly object for display.
plot_count_pct <- ggplot(
  data = cancer_df,
  mapping = aes(x = target_death_rate, y = pct_case_count, color = state)
) +
  geom_point()
# Smoother currently disabled: geom_smooth(se = F)
ggplotly(plot_count_pct)

Incidence rate plot

# Scatter plot of incidence rate against the death rate, coloured by state,
# then converted to a plotly object for display.
plot_incidence <- ggplot(
  data = cancer_df,
  mapping = aes(x = incidence_rate, y = target_death_rate, color = state)
) +
  geom_point()
# Smoother currently disabled: geom_smooth(se = F)
ggplotly(plot_incidence)
# Influential points in the dataset: states Florida and Virginia.

Income plot

# Scatter plot of median income against the death rate, coloured by state,
# then converted to a plotly object for display.
plot_income <- ggplot(
  data = cancer_df,
  mapping = aes(x = med_income, y = target_death_rate, color = state)
) +
  geom_point()
# Smoother currently disabled: geom_smooth(se = F)
ggplotly(plot_income)

Age plots

# Median age against the death rate, with a smoothed trend line drawn without
# its confidence band, converted to a plotly object for display.
plot_age_1 <-
  cancer_df %>%
  ggplot(aes(x = median_age, y = target_death_rate)) +
  geom_point() +
  # Spell out FALSE: the alias `F` is an ordinary variable and can be reassigned.
  geom_smooth(se = FALSE)
ggplotly(plot_age_1)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Erroneous data in this column: some values are larger than 100.

# Male median age against the death rate, with a smoothed trend line drawn
# without its confidence band, converted to a plotly object for display.
plot_age_2 <-
  cancer_df %>%
  ggplot(aes(x = median_age_male, y = target_death_rate)) +
  geom_point() +
  # Spell out FALSE: the alias `F` is an ordinary variable and can be reassigned.
  geom_smooth(se = FALSE)
ggplotly(plot_age_2)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Female median age against the death rate, with a smoothed trend line drawn
# without its confidence band, converted to a plotly object for display.
plot_age_3 <-
  cancer_df %>%
  ggplot(aes(x = median_age_female, y = target_death_rate)) +
  geom_point() +
  # Spell out FALSE: the alias `F` is an ordinary variable and can be reassigned.
  geom_smooth(se = FALSE)
ggplotly(plot_age_3)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Pairwise correlation matrix of the numeric columns, returned as a tibble.
# The two factor columns and the character column `binned_inc` are dropped
# first because `cor()` only accepts numeric input.
cancer_df %>%
  select(-county, -state, -binned_inc) %>%
  cor() %>%
  # `as.tibble()` is deprecated; `as_tibble()` is the supported spelling.
  as_tibble()
## # A tibble: 33 x 33
##    target_death_ra… pct_case_count avg_ann_count avg_deaths_per_…
##               <dbl>          <dbl>         <dbl>            <dbl>
##  1          1             -0.0578        -0.144           -0.0907
##  2         -0.0578         1              0.161           -0.0589
##  3         -0.144          0.161          1                0.939 
##  4         -0.0907        -0.0589         0.939            1     
##  5          0.449          0.0225         0.0736           0.0627
##  6         -0.429          0.0291         0.269            0.223 
##  7         -0.120         -0.0518         0.927            0.978 
##  8          0.429         -0.123         -0.136           -0.0669
##  9         -0.0223        -0.00481        0.0821           0.0635
## 10          0.00438        0.0375        -0.0241          -0.0246
## # ... with 23 more rows, and 29 more variables: incidence_rate <dbl>,
## #   med_income <dbl>, pop_est2015 <dbl>, poverty_percent <dbl>,
## #   study_per_cap <dbl>, median_age <dbl>, median_age_male <dbl>,
## #   median_age_female <dbl>, avg_household_size <dbl>,
## #   percent_married <dbl>, pct_no_hs18_24 <dbl>, pct_hs18_24 <dbl>,
## #   pct_some_col18_24 <dbl>, pct_bach_deg18_24 <dbl>, pct_hs25_over <dbl>,
## #   pct_bach_deg25_over <dbl>, pct_employed16_over <dbl>,
## #   pct_unemployed16_over <dbl>, pct_private_coverage <dbl>,
## #   pct_private_coverage_alone <dbl>, pct_emp_priv_coverage <dbl>,
## #   pct_public_coverage <dbl>, pct_public_coverage_alone <dbl>,
## #   pct_white <dbl>, pct_black <dbl>, pct_asian <dbl>,
## #   pct_other_race <dbl>, pct_married_households <dbl>, birth_rate <dbl>